Note: run intall.packages() before loading the packages.
# general data maniputlation: summarise, filter, etc.
#install.packages("dplyr")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#install.packages("plyr")
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
# manipulation of date/time data
#install.packages("chron")
library(chron)
# interactive plots
#install.packages("scatterD3")
library(scatterD3)
#install.packages("plotly")
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:graphics':
##
## layout
nba_scores <- "/home/rstudio/NBA/scores_nba.test.dat"
lines <- readLines(nba_scores)
head(lines)
## [1] "2016-04-05,15:06:16,Phoenix,0,Atlanta,0,(8:00 PM ET),48.0,400829044"
## [2] "2016-04-05,15:06:16,Chicago,0,Memphis,0,(8:00 PM ET),48.0,400829045"
## [3] "2016-04-05,15:06:16,Cleveland,0,Milwaukee,0,(8:00 PM ET),48.0,400829046"
## [4] "2016-04-05,15:06:16,Oklahoma City,0,Denver,0,(9:00 PM ET),48.0,400829047"
## [5] "2016-04-05,15:06:16,New Orleans,0,Philadelphia,0,(7:00 PM ET),48.0,400829041"
## [6] "2016-04-05,15:06:16,Detroit,0,Miami,0,(8:00 PM ET),48.0,400829042"
# turn the .dat file to dataframe
nba_scores_DF <- as.data.frame(do.call(rbind, strsplit(lines, ",")), stringsAsFactors=FALSE)
## Warning in (function (..., deparse.level = 1) : number of columns of result
## is not a multiple of vector length (arg 14628)
# Since I don't have a header in the data set, I want to specify the column metadata
colnames(nba_scores_DF) <- c("dateOrig","ts","teamlonga", "scorea", "teamlongb", "scoreb", "timestring", "timeleft", "gameid")
nba_scores_DF2 <- transform(nba_scores_DF, dateOrig = as.Date(dateOrig),
ts = as.character(ts),
teamlonga = as.character(teamlonga),
scorea = as.numeric (scorea),
teamlongb = as.character(teamlongb),
scoreb = as.numeric (scoreb),
timestring = as.character(timestring),
timeleft = as.numeric(timeleft),
gameid = as.character(gameid))
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
This data is the raw input that contains a record for each update of the game. Data has some errors and redundancies that must be removed. Will discuss that as we go along …. in particular, we need to seperate the in game scores and the final score and re-merge them for our model
# NAs are introduced because the raw data has invalid data points, so remove these observations
rtscoresAndFinalDF <- na.omit(nba_scores_DF2)
dim(rtscoresAndFinalDF) #16746 9
## [1] 16746 9
head(rtscoresAndFinalDF)
## dateOrig ts teamlonga scorea teamlongb scoreb
## 1 2016-04-05 15:06:16 Phoenix 0 Atlanta 0
## 2 2016-04-05 15:06:16 Chicago 0 Memphis 0
## 3 2016-04-05 15:06:16 Cleveland 0 Milwaukee 0
## 4 2016-04-05 15:06:16 Oklahoma City 0 Denver 0
## 5 2016-04-05 15:06:16 New Orleans 0 Philadelphia 0
## 6 2016-04-05 15:06:16 Detroit 0 Miami 0
## timestring timeleft gameid
## 1 (8:00 PM ET) 48 400829044
## 2 (8:00 PM ET) 48 400829045
## 3 (8:00 PM ET) 48 400829046
## 4 (9:00 PM ET) 48 400829047
## 5 (7:00 PM ET) 48 400829041
## 6 (8:00 PM ET) 48 400829042
head(filter(rtscoresAndFinalDF, grepl("FINAL", timestring)))
## dateOrig ts teamlonga scorea teamlongb scoreb timestring
## 1 2016-04-05 21:22:09 New Orleans 93 Philadelphia 107 (FINAL)
## 2 2016-04-05 22:08:42 Charlotte 90 Toronto 96 (FINAL)
## 3 2016-04-05 22:25:25 Chicago 92 Memphis 108 (FINAL)
## 4 2016-04-05 22:28:58 Phoenix 90 Atlanta 103 (FINAL)
## 5 2016-04-05 22:30:29 Cleveland 109 Milwaukee 80 (FINAL)
## 6 2016-04-05 22:30:29 Detroit 89 Miami 107 (FINAL)
## timeleft gameid
## 1 0 400829041
## 2 0 400829043
## 3 0 400829045
## 4 0 400829044
## 5 0 400829046
## 6 0 400829042
head(filter(rtscoresAndFinalDF, grepl("1ST", timestring)))
## dateOrig ts teamlonga scorea teamlongb scoreb timestring
## 1 2016-04-05 19:23:42 New Orleans 23 Philadelphia 12 (4:39 IN 1ST)
## 2 2016-04-05 19:23:57 New Orleans 23 Philadelphia 14 (4:05 IN 1ST)
## 3 2016-04-05 19:24:13 New Orleans 23 Philadelphia 14 (3:41 IN 1ST)
## 4 2016-04-05 19:24:28 New Orleans 23 Philadelphia 14 (3:32 IN 1ST)
## 5 2016-04-05 19:24:43 New Orleans 23 Philadelphia 16 (3:24 IN 1ST)
## 6 2016-04-05 19:25:29 New Orleans 23 Philadelphia 16 (3:11 IN 1ST)
## timeleft gameid
## 1 40.65000 400829041
## 2 40.08333 400829041
## 3 39.68333 400829041
## 4 39.53333 400829041
## 5 39.40000 400829041
## 6 39.18333 400829041
These were a couple custom UDF’s I needed to cleanse the data and also to add a few features based on a proprietary way of combining the score with the time left.
# Function to turn long team name to short
teamMap <- function(x) {
tnames <- data.frame(
long = as.factor(c("Atlanta", "Boston", "Brooklyn", "Charlotte", "Chicago",
"Cleveland", "Dallas", "Denver", "Detroit", "Golden State",
"Houston","Indiana", "LA Clippers", "LA Lakers", "Memphis",
"Miami", "Milwaukee", "Minnesota", "New Orleans", "New York",
"Oklahoma City", "Orlando", "Philadelphia", "Phila.", "Phoenix",
"Portland", "Sacramento", "San Antonio", "Toronto", "Utah", "Washington")),
short = as.factor(c("atl", "bos", "bkn", "cha", "chi",
"cle", "dal", "den", "det", "gst",
"hou", "ind", "lac", "lal", "mem",
"mia", "mil", "min", "nor", "nyk",
"okc", "orl", "phi", "phi", "pho",
"por", "sac", "san", "tor", "uta", "wsh"))
)
df_x <- data.frame(long=x)
short <- tnames$short[match(df_x$long, tnames$long)]
return(short)
}
# Function to convert 3-character month to 2-digit numeric month
monthMap <-function(x) {
a <-data.frame(
str = as.factor(c("Jan", "Feb", "Mar", "Apr", "May",
"Jun", "Jul", "Aug", "Sep", "Oct",
"Nov", "Dec")),
num = as.factor(c("01", "02", "03", "04", "05",
"06", "07", "08", "09", "10",
"11", "12"))
)
df_x <- data.frame(str=x)
num <- a$num[match(df_x$str, a$str)]
return(num)
}
# Date Logic to adjust for games that finish on the day after ....
# This is due to not having a great key to join my tables ...
dateadjustudf <- function(datein, tsin){
newdate <- c()
for (i in 1:length(tsin)){
if (grepl("^0[0-3]", tsin[i])) {
newdate[i] = datein[i] - 1
} else {
newdate[i] = datein[i]
}
}
return(newdate)
}
# UDFs to create some extra features ... this one is for an experiemental combination of Time left and Score difference.
# Made this via intuition. This can be extended to add other custom features
# val crossOverTime = 8
# val exponentScaler = 0.5
# There is no need to create UDFs here
Here I create some extra columns for later use.
# Remove Overtime games from this analysis
rtscoresAndFinalDF <- filter(rtscoresAndFinalDF, !grepl(".*OT.*", timestring))
#16626
# Create short 3 character team names
rtscoresAndFinalDF$teama <- teamMap(rtscoresAndFinalDF$teamlonga)
rtscoresAndFinalDF$teamb <- teamMap(rtscoresAndFinalDF$teamlongb)
# Add a score differential Column
rtscoresAndFinalDF$scorea_scoreb <- rtscoresAndFinalDF$scorea - rtscoresAndFinalDF$scoreb
# Transform the Date. This is for games that spanned multiple days and gave me a headache.
# Games adjusted to the day they started on.
rtscoresAndFinalDF$date <- dateadjustudf(rtscoresAndFinalDF$dateOrig, rtscoresAndFinalDF$ts)
rtscoresAndFinalDF$date <- as.Date(rtscoresAndFinalDF$date, origin = "1970-01-01")
# Create a Key for me to use to join with odds data later. Key = date.teama.teamb
for (i in 1:nrow(rtscoresAndFinalDF)){
rtscoresAndFinalDF$key[i] <- paste0(rtscoresAndFinalDF$date[i], ".", rtscoresAndFinalDF$teama[i], ".", rtscoresAndFinalDF$teamb[i])
}
Currently based on the way the data was sampled, both real time scores and final scores are written as seperate records to the same file. I need to pull these apart, and then join the dataframes so that I have a real time score and features and know if the game was won or lost ….
# Create Final Score DF
# Note a shortcut for repeating the dataframe within the filter is to use a $df.filter(df("foo").contains ... is equiv to df.filter($"foo".contains)
finalscoresDF <- filter(rtscoresAndFinalDF, grepl("FINAL", timestring))
# Rename some columns so that join later doesnt have name overlaps
finalscoresDF$fscorea <- finalscoresDF$scorea
finalscoresDF$fscoreb <- finalscoresDF$scoreb
# Create final score difference
finalscoresDF$fscorea_fscoreb <- finalscoresDF$fscorea - finalscoresDF$fscoreb
finalscoresDF$fscoreb_fscorea <- finalscoresDF$fscoreb - finalscoresDF$fscorea
# Add a Win/loss column Win = 1, Loss = 0
for (i in 1 : nrow(finalscoresDF)){
if (finalscoresDF$fscorea_fscoreb[i] > 0){
finalscoresDF$home_win[i] <- 0
finalscoresDF$away_win[i] <- 1
} else {
finalscoresDF$home_win[i] <- 1
finalscoresDF$away_win[i] <- 0
}
}
#################################################################################################################
# Create Real time score DF and more wrangling
# Remove Halftime records and these other cases as my datasource doesnt always change the quarter well
# as this particular case isn't handled well... (for now)
rtscoresDF <- filter(rtscoresAndFinalDF, !grepl('HALF', timestring), !grepl('FINAL', timestring),
timestring != "(12:00 IN 1ST)" ,
timestring != "(12:00 IN 2ND)" ,
timestring != "(12:00 IN 3RD)" ,
timestring != "(12:00 IN 4TH)" ,
timestring != "(END OF 1ST)" ,
timestring != "(END OF 2ND)" ,
timestring != "(END OF 3RD)" ,
timestring != "(END OF 4TH)" )
# Create real time score difference
rtscoresDF$scorea_scoreb <- rtscoresDF$scorea - rtscoresDF$scoreb
rtscoresDF$scoreb_scorea <- rtscoresDF$scoreb - rtscoresDF$scorea
# Create a game PCT complete and PCT left indictor
rtscoresDF$pct_complete <- (((rtscoresDF$timeleft * -1) + 48 )/48.0)*100
rtscoresDF$pct_left <- 100 - rtscoresDF$pct_complete
# Create a unique feature. Idea here is that I have intuition that timeleft and score difference are a strong predictor when combined
rtscoresDF$cf1 <- (1/((rtscoresDF$pct_left/25 + .01)^.5)) * rtscoresDF$scoreb_scorea
rtscoresDF$cf2 <- (1/((rtscoresDF$pct_left/2.0 + .01)^1.3))*rtscoresDF$scoreb_scorea
After building my initial model, I noticed that the logistic model was adjusting the probabilities well at the end of the games. I had some examples where I had 0 time left in the game, and yet the logistic model was giving a 70% chance of victory for a team. I speculated this was due to the fact that my original features were not fitting the end of game very well. To fix this, I created a spreader custom feature that basically takes the score difference and amplifies it as the score nears the end of the game. This way this feature is very predictive at the end of games and can help adjust the probablities to be more certain at the end of games.
# subset a dataframe for scatterplot
spreader <- filter(rtscoresDF, pct_complete < 95)
# draw interactive scatter plot
scatterD3(x = spreader$pct_complete, y = spreader$scoreb_scorea, col_var = spreader$key)
scatterD3(x = spreader$pct_complete, y = spreader$cf2, col_var = spreader$key)
***
# Some Printouts .....
print("final scores data frame")
## [1] "final scores data frame"
head(finalscoresDF)
## dateOrig ts teamlonga scorea teamlongb scoreb timestring
## 1 2016-04-05 21:22:09 New Orleans 93 Philadelphia 107 (FINAL)
## 2 2016-04-05 22:08:42 Charlotte 90 Toronto 96 (FINAL)
## 3 2016-04-05 22:25:25 Chicago 92 Memphis 108 (FINAL)
## 4 2016-04-05 22:28:58 Phoenix 90 Atlanta 103 (FINAL)
## 5 2016-04-05 22:30:29 Cleveland 109 Milwaukee 80 (FINAL)
## 6 2016-04-05 22:30:29 Detroit 89 Miami 107 (FINAL)
## timeleft gameid teama teamb scorea_scoreb date
## 1 0 400829041 nor phi -14 2016-04-05
## 2 0 400829043 cha tor -6 2016-04-05
## 3 0 400829045 chi mem -16 2016-04-05
## 4 0 400829044 pho atl -13 2016-04-05
## 5 0 400829046 cle mil 29 2016-04-05
## 6 0 400829042 det mia -18 2016-04-05
## key fscorea fscoreb fscorea_fscoreb fscoreb_fscorea
## 1 2016-04-05.nor.phi 93 107 -14 14
## 2 2016-04-05.cha.tor 90 96 -6 6
## 3 2016-04-05.chi.mem 92 108 -16 16
## 4 2016-04-05.pho.atl 90 103 -13 13
## 5 2016-04-05.cle.mil 109 80 29 -29
## 6 2016-04-05.det.mia 89 107 -18 18
## home_win away_win
## 1 1 0
## 2 1 0
## 3 1 0
## 4 1 0
## 5 0 1
## 6 1 0
paste0("Total Games = ", nrow(finalscoresDF))
## [1] "Total Games = 116"
print("real time scores data frame")
## [1] "real time scores data frame"
head(rtscoresDF)
## dateOrig ts teamlonga scorea teamlongb scoreb
## 1 2016-04-05 15:06:16 Phoenix 0 Atlanta 0
## 2 2016-04-05 15:06:16 Chicago 0 Memphis 0
## 3 2016-04-05 15:06:16 Cleveland 0 Milwaukee 0
## 4 2016-04-05 15:06:16 Oklahoma City 0 Denver 0
## 5 2016-04-05 15:06:16 New Orleans 0 Philadelphia 0
## 6 2016-04-05 15:06:16 Detroit 0 Miami 0
## timestring timeleft gameid teama teamb scorea_scoreb date
## 1 (8:00 PM ET) 48 400829044 pho atl 0 2016-04-05
## 2 (8:00 PM ET) 48 400829045 chi mem 0 2016-04-05
## 3 (8:00 PM ET) 48 400829046 cle mil 0 2016-04-05
## 4 (9:00 PM ET) 48 400829047 okc den 0 2016-04-05
## 5 (7:00 PM ET) 48 400829041 nor phi 0 2016-04-05
## 6 (8:00 PM ET) 48 400829042 det mia 0 2016-04-05
## key scoreb_scorea pct_complete pct_left cf1 cf2
## 1 2016-04-05.pho.atl 0 0 100 0 0
## 2 2016-04-05.chi.mem 0 0 100 0 0
## 3 2016-04-05.cle.mil 0 0 100 0 0
## 4 2016-04-05.okc.den 0 0 100 0 0
## 5 2016-04-05.nor.phi 0 0 100 0 0
## 6 2016-04-05.det.mia 0 0 100 0 0
paste0("Total Number of rt score records = ", nrow(rtscoresDF))
## [1] "Total Number of rt score records = 15947"
How to Read the Raw Odds data
Example Golden State -12.5 O (207.0) -125.0 | Detroit 12.5 U (207.0) 145.0
The away team is listed first, and the home team is second
Here Golden State is a 12.5 pt favorite to win. The over under is in parentheses (207) and is the 50/50 line between teams sum of scores
being above/below that line.
Finally the -125 / +145 numbers are whats known at the moneyline odds.
A negative number means you need to bet 125$ to get a 100$ payout
A positive number means you need to bet 100$ to get a 145$ payout
nba_odds <- "/home/rstudio/NBA/nbaodds_042516.xml"
xml <- as.list(readLines(nba_odds))
head(xml)
## [[1]]
## [1] "basketball_nba.040516.xml: <title>New Orleans 2.5 O (207.0) 125.0 | Phila. -2.5 U (207.0) -145.0 (Apr 05, 2016 07:10 PM)</title>"
##
## [[2]]
## [1] "basketball_nba.040516.xml: <title>Detroit 4.0 O (202.0) 160.0 | Miami -4.0 U (202.0) -190.0 (Apr 05, 2016 08:05 PM)</title>"
##
## [[3]]
## [1] "basketball_nba.040516.xml: <title>Charlotte 4.0 O (200.5) 155.0 | Toronto -4.0 U (200.5) -175.0 (Apr 05, 2016 07:40 PM)</title>"
##
## [[4]]
## [1] "basketball_nba.040516.xml: <title>Phoenix 14.5 O (207.5) -110.0 | Atlanta -14.5 U (207.5) -110.0 (Apr 05, 2016 08:10 PM)</title>"
##
## [[5]]
## [1] "basketball_nba.040516.xml: <title>Chicago -3.0 O (201.5) -150.0 | Memphis 3.0 U (201.5) 130.0 (Apr 05, 2016 08:10 PM)</title>"
##
## [[6]]
## [1] "basketball_nba.040516.xml: <title>Cleveland -7.5 O (203.0) -340.0 | Milwaukee 7.5 U (203.0) 280.0 (Apr 05, 2016 08:10 PM)</title>"
# use regular expression to catch info we need
odds <- lapply(xml, function(x) substr(x, regexpr(">", x) + 1, regexpr("/", x) - 2))
odds_split <- lapply(odds, function(x) unlist(strsplit(x, " ")))
# get teamlonga
teamlonga_0 <- lapply(odds_split, function(x) paste(x[1], x[2]))
teamlonga <- lapply(teamlonga_0, function(x){
if (regexpr("[0-9|-]", x) > -1) {
substr(x, 1, regexpr("[0-9|-]", x)-2)
} else{
x
}
})
# get teamlongb
teamlongb_0 <- lapply(odds_split, function(x) paste(x[7],x[8], x[9]))
teamlongb_1 <- lapply(teamlongb_0, function(x){
if (regexpr("[0-9]", x) > -1) {
substr(x, regexpr("[A-Za-z]", x), regexpr("[0-9-]", x)-2)
} else{
x
}
})
teamlongb <- lapply(teamlongb_1, function(x){
if (regexpr("|", x) > -1){
substr(x, regexpr("[A-Za-z]", x), nchar(x))
} else {
x
}
})
# teamaspread
teamaspread_0 <- lapply(odds, function(x){
substr(x, regexpr("[0-9-]",x), regexpr("[0-9-]",x)+4)
})
teamaspread <- lapply(teamaspread_0, function(x){
if (regexpr("[ ]", x) > 0){
substr(x, 1, regexpr("[ ]", x)-1)
} else {
x
}
})
# overunder
overunder <- lapply(odds, function(x){
substr(x, regexpr("[(]", x) + 1, regexpr("[)]", x) - 1)
})
# teamaml
teamaml <- lapply(odds, function(x){
substr(x,regexpr("[)]", x) + 2, regexpr("[|]", x) - 2 )
})
# teambml
teambml <- lapply(odds, function(x){
substr(x, gregexpr("[)]", x)[[1]][2]+2, gregexpr("[(]", x)[[1]][3]-2)
})
#get date
dateStr <- lapply(odds, function(x){
month <- substr(x, gregexpr("[(]", x)[[1]][3]+1, gregexpr("[(]", x)[[1]][3]+3)
month_num <- monthMap(month)
date <- substr(x, gregexpr("[(]", x)[[1]][3]+5, gregexpr("[(]", x)[[1]][3]+6)
year <- substr(x, gregexpr("[(]", x)[[1]][3]+9, gregexpr("[(]", x)[[1]][3]+12)
paste0(year, "-", month_num, "-", date)
})
# get short team names
teama <- lapply(teamlonga, teamMap)
teamb <- lapply(teamlongb, teamMap)
# bind all column together into dataframe
oddsDF <- do.call(rbind, Map(data.frame, teamlonga=teamlonga, teama=teama, teamlongb=teamlongb, teamb=teamb, teamaspread=teamaspread, overunder=overunder, teamaml=teamaml, teambml=teambml, dateStr=dateStr))
# change to right data type and create a key for join later
oddsDF$teamaspread <- as.numeric(as.character(oddsDF$teamaspread))
oddsDF$overunder <- as.numeric(as.character(oddsDF$overunder))
oddsDF$teamaml <- as.numeric(as.character(oddsDF$teamaml))
oddsDF$teambml <- as.numeric(as.character(oddsDF$teambml))
oddsDF$teama <- as.character(oddsDF$teama)
oddsDF$teamb <- as.character(oddsDF$teamb)
oddsDF$key <- paste0(oddsDF$dateStr, ".", oddsDF$teama, ".", oddsDF$teamb)
dim(oddsDF) #161 10
## [1] 161 10
# add the groupby and average below because I was getting the game odds over multiple days, and it was adding noise to the analysis
oddsDF2 <- ddply(oddsDF, c("key", "teamlonga", "teamlongb", "teama", "teamb", "dateStr"), summarise,
teamaspread = mean(teamaspread),
overunder = mean(overunder),
teamaml = mean(teamaml),
teambml = mean(teambml))
# Create a few new columns for later analysis
oddsDF2$teambspread <- oddsDF2$teamaspread * -1
oddsDF2$teama_vegas_fscore <- (oddsDF2$overunder / 2.0) - (oddsDF2$teamaspread / 2.0)
oddsDF2$teamb_vegas_fscore <- (oddsDF2$overunder / 2.0) + (oddsDF2$teamaspread / 2.0)
head(oddsDF2)
## key teamlonga teamlongb teama teamb dateStr
## 1 2016-04-05.cha.tor Charlotte Toronto cha tor 2016-04-05
## 2 2016-04-05.chi.mem Chicago Memphis chi mem 2016-04-05
## 3 2016-04-05.cle.mil Cleveland Milwaukee cle mil 2016-04-05
## 4 2016-04-05.det.mia Detroit Miami det mia 2016-04-05
## 5 2016-04-05.lal.lac LA Lakers LA Clippers lal lac 2016-04-05
## 6 2016-04-05.min.gst Minnesota Golden State min gst 2016-04-05
## teamaspread overunder teamaml teambml teambspread teama_vegas_fscore
## 1 4.0 200.5 155 -175 -4.0 98.25
## 2 -3.0 201.5 -150 130 3.0 102.25
## 3 -7.5 203.0 -340 280 7.5 105.25
## 4 4.0 202.0 160 -190 -4.0 99.00
## 5 14.5 208.0 -110 -110 -14.5 96.75
## 6 15.5 225.0 -110 -110 -15.5 104.75
## teamb_vegas_fscore
## 1 102.25
## 2 99.25
## 3 97.75
## 4 103.00
## 5 111.25
## 6 120.25
paste("total home teams = ", length(unique(oddsDF2$teama)))
## [1] "total home teams = 30"
paste("total away teams = ", length(unique(oddsDF2$teamb)))
## [1] "total away teams = 30"
paste("total games collected = ", nrow(oddsDF2))
## [1] "total games collected = 111"
Here we are averaging the away spread per team. If the bar is above the zero line, then the team is an underdog, and under the line the team is the favorite. 8 of the 32 teams were favorites on the road… and they are the likely suspect including CLE/GST/OKC
# visualize away spread data
avg_away_spread <- ddply(oddsDF2, c("teamlonga", "teamlongb"), summarise,
awayspread_avg_teamaspread = mean(teamaspread),
awayspread_avg_teambspread = mean(teambspread))
# away spread group by teama
away_spread_teama <- ddply(avg_away_spread, c("teamlonga"), summarise,
teamaspread = mean(awayspread_avg_teamaspread))
# order by teama
away_spread_teama$teamlonga <- as.character(away_spread_teama$teamlonga)
away_spread_teama <- away_spread_teama[order(away_spread_teama$teamlonga), ]
# barchart
p <- plot_ly(
x = away_spread_teama$teamlonga,
y = away_spread_teama$teamaspread,
type = "bar")
p
Here we are averaging the home spread per team. If the bar is above the zero line, then the team is an underdog, and under the line the team is the favorite. Note here that the home teams are favored much more, with the usual suspects having a very large advantage (SAN/GST/OKC)
# spread group by teamb
away_spread_teamb <- ddply(avg_away_spread, c("teamlongb"), summarise,
teambspread = mean(awayspread_avg_teambspread))
# order by teamb
away_spread_teamb$teamlongb <- as.character(away_spread_teamb$teamlongb)
away_spread_teamb <- away_spread_teamb[order(away_spread_teamb$teamlongb), ]
p <- plot_ly(
x = away_spread_teamb$teamlongb,
y = away_spread_teamb$teambspread,
type = "bar")
p
# Here is where we join the Odds/Realtime scores/ Final Scores into one wholistic data set as input for Logistic Machine Learning
# Create a smaller Final Score Dataframe. Just keep the key, final score a and b, the win/loss indicator
finalslicedscoresDF <- finalscoresDF[c("key","fscorea", "fscoreb", "fscorea_fscoreb", "fscoreb_fscorea", "away_win", "home_win")]
# First Join the 2 smallest data frames ... odd and final.
gameDF <- merge(finalslicedscoresDF, oddsDF2, by = "key")
gameDF$teamlonga <- NULL
gameDF$teamlongb <- NULL
gameDF$teama <- NULL
gameDF$teamb <- NULL
# Print Out the Game Dataframe ... notice we have the odds data merged with the win loss data ....
print("gameDF")
## [1] "gameDF"
head(gameDF)
## key fscorea fscoreb fscorea_fscoreb fscoreb_fscorea
## 1 2016-04-05.cha.tor 90 96 -6 6
## 2 2016-04-05.chi.mem 92 108 -16 16
## 3 2016-04-05.cle.mil 109 80 29 -29
## 4 2016-04-05.det.mia 89 107 -18 18
## 5 2016-04-05.lal.lac 81 103 -22 22
## 6 2016-04-05.nor.phi 93 107 -14 14
## away_win home_win dateStr teamaspread overunder teamaml teambml
## 1 0 1 2016-04-05 4.0 200.5 155 -175
## 2 0 1 2016-04-05 -3.0 201.5 -150 130
## 3 1 0 2016-04-05 -7.5 203.0 -340 280
## 4 0 1 2016-04-05 4.0 202.0 160 -190
## 5 0 1 2016-04-05 14.5 208.0 -110 -110
## 6 0 1 2016-04-05 2.5 207.0 125 -145
## teambspread teama_vegas_fscore teamb_vegas_fscore
## 1 -4.0 98.25 102.25
## 2 3.0 102.25 99.25
## 3 7.5 105.25 97.75
## 4 -4.0 99.00 103.00
## 5 -14.5 96.75 111.25
## 6 -2.5 102.25 104.75
paste("total games collected:", nrow(gameDF)) #103
## [1] "total games collected: 103"
# Here we show that the better a team is (negative spread, the more they are likely to win ...)
#Here the spread at the start of the game is a decent predictor regarding the end result
# Final Score Difference vs Spread
# Top Left indicates teams with a large pos spread will lose by a wider margin
# the line should approx pass through 0,0
# lower Right indicates teams with large neg spread will win by a wider margin
# The logistic and linear models we build will quantify this for us later!
scatterD3(x = gameDF$fscoreb_fscorea, y = gameDF$teamaspread)
# Here we can show another weak correlation of the vegas overunder/spread to the actual final outcome.
# vegas_fscore was calculated by taking overunder/2 +- the spread/2 to get a projection of
# the home/away teams score
# Here if the prediction and data were perfectly correlated, we would pass through the
# y=x line. in general we follow that path
# we will see how this term plays when we dig into the linear model
# here only home team is shown, but same trend holds for away team
scatterD3(x = gameDF$teamb_vegas_fscore, y = gameDF$fscoreb)
# This is the bigger merge. Merging the odds/final score data with the real time indicators ...
lrDF <- merge(gameDF, rtscoresDF, by = "key")
print("lrDF : Logistic Regression Data Frame")
## [1] "lrDF : Logistic Regression Data Frame"
head(lrDF)
## key fscorea fscoreb fscorea_fscoreb fscoreb_fscorea
## 1 2016-04-05.cha.tor 90 96 -6 6
## 2 2016-04-05.cha.tor 90 96 -6 6
## 3 2016-04-05.cha.tor 90 96 -6 6
## 4 2016-04-05.cha.tor 90 96 -6 6
## 5 2016-04-05.cha.tor 90 96 -6 6
## 6 2016-04-05.cha.tor 90 96 -6 6
## away_win home_win dateStr teamaspread overunder teamaml teambml
## 1 0 1 2016-04-05 4 200.5 155 -175
## 2 0 1 2016-04-05 4 200.5 155 -175
## 3 0 1 2016-04-05 4 200.5 155 -175
## 4 0 1 2016-04-05 4 200.5 155 -175
## 5 0 1 2016-04-05 4 200.5 155 -175
## 6 0 1 2016-04-05 4 200.5 155 -175
## teambspread teama_vegas_fscore teamb_vegas_fscore dateOrig ts
## 1 -4 98.25 102.25 2016-04-05 20:32:49
## 2 -4 98.25 102.25 2016-04-05 20:56:38
## 3 -4 98.25 102.25 2016-04-05 20:08:41
## 4 -4 98.25 102.25 2016-04-05 20:26:10
## 5 -4 98.25 102.25 2016-04-05 20:32:34
## 6 -4 98.25 102.25 2016-04-05 19:49:35
## teamlonga scorea teamlongb scoreb timestring timeleft gameid
## 1 Charlotte 42 Toronto 52 (1:12 IN 2ND) 25.20000 400829043
## 2 Charlotte 44 Toronto 58 (8:56 IN 3RD) 20.93333 400829043
## 3 Charlotte 19 Toronto 28 (11:08 IN 2ND) 35.13333 400829043
## 4 Charlotte 35 Toronto 41 (4:11 IN 2ND) 28.18333 400829043
## 5 Charlotte 40 Toronto 52 (1:30 IN 2ND) 25.50000 400829043
## 6 Charlotte 9 Toronto 12 (7:18 IN 1ST) 43.30000 400829043
## teama teamb scorea_scoreb date scoreb_scorea pct_complete pct_left
## 1 cha tor -10 2016-04-05 10 47.500000 52.50000
## 2 cha tor -14 2016-04-05 14 56.388889 43.61111
## 3 cha tor -9 2016-04-05 9 26.805556 73.19444
## 4 cha tor -6 2016-04-05 6 41.284722 58.71528
## 5 cha tor -12 2016-04-05 12 46.875000 53.12500
## 6 cha tor -3 2016-04-05 3 9.791667 90.20833
## cf1 cf2
## 1 6.884284 0.14286204
## 2 10.569592 0.25452584
## 3 5.250891 0.08348447
## 4 3.906817 0.07411763
## 5 8.212631 0.16881814
## 6 1.577128 0.02120872
paste("total data points collected:", nrow(lrDF)) #13412
## [1] "total data points collected: 13412"
# Add an overunder/spread adjusted projection as points are scored during the game
# I found this is a strong indicator
lrDF$teama_adj_fscore <- ((lrDF$pct_complete * -1)/100 + 1) * lrDF$teama_vegas_fscore + lrDF$scorea
lrDF$teamb_adj_fscore <- ((lrDF$pct_complete * -1)/100 + 1) * lrDF$teamb_vegas_fscore + lrDF$scoreb
lrDF$pfscoreb_pfscorea <- lrDF$teamb_adj_fscore - lrDF$teama_adj_fscore
# There is an issue with the data I had captured. When a quarter transitions from 1st->2nd (etc,etc), sometime the timestring doesn't get updated properly. Since I used the timestring to calculate the timeleft in the game, I would get some rogue data points.
# Example, after 1 min in a game, something the two teams would have scores in the 20's, because it was really at 11 mins in the second quarter.
# My solution was to use the final score sum, and then just scale that down to the time left in the game. I would then compare to the sum of scores i had, and if it was significantly higher, I would remove them. I did this by visual inspection ...
# dfa = departure_from_avg
lrDF$dfa <- (lrDF$fscorea + lrDF$fscoreb)/48 * (lrDF$timeleft * -1 + 48) - (lrDF$scorea + lrDF$scoreb)
lrDF_filtered <- filter(lrDF, dfa > -30)
summary(lrDF_filtered)
## key fscorea fscoreb fscorea_fscoreb
## Length:13217 Min. : 68.00 Min. : 80.0 Min. :-38.00
## Class :character 1st Qu.: 92.00 1st Qu.: 97.0 1st Qu.:-15.00
## Mode :character Median : 99.00 Median :105.0 Median : -8.00
## Mean : 99.71 Mean :105.3 Mean : -5.59
## 3rd Qu.:107.00 3rd Qu.:113.0 3rd Qu.: 5.00
## Max. :131.00 Max. :144.0 Max. : 29.00
##
## fscoreb_fscorea away_win home_win dateStr
## Min. :-29.00 Min. :0.000 Min. :0.000 2016-04-13:1829
## 1st Qu.: -5.00 1st Qu.:0.000 1st Qu.:0.000 2016-04-05:1452
## Median : 8.00 Median :0.000 Median :1.000 2016-04-08:1448
## Mean : 5.59 Mean :0.353 Mean :0.647 2016-04-11:1233
## 3rd Qu.: 15.00 3rd Qu.:1.000 3rd Qu.:1.000 2016-04-10:1168
## Max. : 38.00 Max. :1.000 Max. :1.000 2016-04-06:1108
## (Other) :4979
## teamaspread overunder teamaml teambml
## Min. :-13.000 Min. :180.5 Min. :-553.33 Min. :-750.0
## 1st Qu.: -3.500 1st Qu.:200.3 1st Qu.:-150.00 1st Qu.:-230.0
## Median : 4.833 Median :205.5 Median : 7.50 Median :-127.5
## Mean : 3.518 Mean :204.8 Mean : 26.41 Mean :-122.4
## 3rd Qu.: 9.500 3rd Qu.:209.5 3rd Qu.: 190.00 3rd Qu.: 115.0
## Max. : 19.000 Max. :225.2 Max. : 541.67 Max. : 410.0
##
## teambspread teama_vegas_fscore teamb_vegas_fscore
## Min. :-19.000 Min. : 84.38 Min. : 84.92
## 1st Qu.: -9.500 1st Qu.: 97.17 1st Qu.:100.50
## Median : -4.833 Median :100.25 Median :104.50
## Mean : -3.518 Mean :100.66 Mean :104.18
## 3rd Qu.: 3.500 3rd Qu.:103.50 3rd Qu.:108.75
## Max. : 13.000 Max. :115.75 Max. :119.12
##
## dateOrig ts teamlonga
## Min. :2016-04-05 Length:13217 Length:13217
## 1st Qu.:2016-04-08 Class :character Class :character
## Median :2016-04-11 Mode :character Mode :character
## Mean :2016-04-11
## 3rd Qu.:2016-04-14
## Max. :2016-04-24
##
## scorea teamlongb scoreb timestring
## Min. : 0.00 Length:13217 Min. : 0.00 Length:13217
## 1st Qu.: 28.00 Class :character 1st Qu.: 29.00 Class :character
## Median : 53.00 Mode :character Median : 56.00 Mode :character
## Mean : 52.85 Mean : 56.14
## 3rd Qu.: 77.00 3rd Qu.: 82.00
## Max. :131.00 Max. :144.00
##
## timeleft gameid teama teamb
## Min. : 0.00 Length:13217 cha : 796 bos : 889
## 1st Qu.:10.07 Class :character san : 716 mia : 804
## Median :22.73 Mode :character okc : 671 dal : 687
## Mean :22.40 cle : 639 tor : 639
## 3rd Qu.:34.25 tor : 578 hou : 593
## Max. :48.00 mem : 577 ind : 586
## (Other):9240 (Other):9019
## scorea_scoreb date scoreb_scorea pct_complete
## Min. :-44.000 Min. :2016-04-05 Min. :-33.000 Min. : 0.00
## 1st Qu.:-11.000 1st Qu.:2016-04-08 1st Qu.: -3.000 1st Qu.: 28.65
## Median : -3.000 Median :2016-04-11 Median : 3.000 Median : 52.64
## Mean : -3.293 Mean :2016-04-11 Mean : 3.293 Mean : 53.32
## 3rd Qu.: 3.000 3rd Qu.:2016-04-13 3rd Qu.: 11.000 3rd Qu.: 79.03
## Max. : 33.000 Max. :2016-04-24 Max. : 44.000 Max. :100.00
##
## pct_left cf1 cf2 teama_adj_fscore
## Min. : 0.00 Min. :-290.000 Min. :-11545.108 Min. : 66.42
## 1st Qu.: 20.97 1st Qu.: -2.362 1st Qu.: -0.044 1st Qu.: 93.50
## Median : 47.36 Median : 1.821 Median : 0.033 Median : 99.58
## Mean : 46.68 Mean : 4.799 Mean : 31.508 Mean : 99.85
## 3rd Qu.: 71.35 3rd Qu.: 9.017 3rd Qu.: 0.244 3rd Qu.:106.12
## Max. :100.00 Max. : 380.000 Max. : 15128.072 Max. :135.35
##
## teamb_adj_fscore pfscoreb_pfscorea dfa
## Min. : 75.50 Min. :-34.915 Min. :-26.7865
## 1st Qu.: 97.55 1st Qu.: -3.841 1st Qu.: -3.7944
## Median :104.41 Median : 5.383 Median : 0.6389
## Mean :104.79 Mean : 4.938 Mean : 0.4491
## 3rd Qu.:111.73 3rd Qu.: 14.159 3rd Qu.: 5.0549
## Max. :146.36 Max. : 47.362 Max. : 21.3115
##
# here we can see the trajectory of some of the games .....
# upper left beginning ... upper right (win), lower right (loss)
# cool visual .... gives an idea about how the games flow
tsplot <- filter(lrDF_filtered, grepl("cle", key) | grepl("gst", key))
scatterD3(x = tsplot$pct_complete, y = tsplot$scoreb_scorea, col_var = tsplot$key)
DQ_check <- ddply(lrDF_filtered, c("key"), summarise,
N = length(key))
# order by N
DQ_check <- DQ_check[order(DQ_check$N),]
p <- plot_ly(
x = DQ_check$Key,
y = DQ_check$N,
type = "bar")
p
# Wanted to save out the dataset at this point as I will branch into seperate work efforts for a Logistic/Linear model building
# drop some columns as we move on to next step !!
lrDF_final <- lrDF_filtered
lrDF_final$dateOrig <- NULL
lrDF_final$ts <- NULL
lrDF_final$teamlonga.x <- NULL
lrDF_final$teamlongb.x <- NULL
lrDF_final$teamlonga.y <- NULL
lrDF_final$teamlongb.y <- NULL
lrDF_final$timestring <- NULL
lrDF_final$gameid <- NULL
lrDF_final$teamaml <- NULL
lrDF_final$teambml <- NULL
lrDF_final$dfa <- NULL
lrDF_final$teama.y <- NULL
lrDF_final$teamb.y <- NULL
lrDF_final$dateStr <- NULL
names(lrDF_final)[c(8, 9)] <- c("teama", "teamb")
head(lrDF_final)
## key fscorea fscoreb fscorea_fscoreb fscoreb_fscorea
## 1 2016-04-05.cha.tor 90 96 -6 6
## 2 2016-04-05.cha.tor 90 96 -6 6
## 3 2016-04-05.cha.tor 90 96 -6 6
## 4 2016-04-05.cha.tor 90 96 -6 6
## 5 2016-04-05.cha.tor 90 96 -6 6
## 6 2016-04-05.cha.tor 90 96 -6 6
## away_win home_win teama teamb teambspread teama_vegas_fscore
## 1 0 1 4 200.5 -4 98.25
## 2 0 1 4 200.5 -4 98.25
## 3 0 1 4 200.5 -4 98.25
## 4 0 1 4 200.5 -4 98.25
## 5 0 1 4 200.5 -4 98.25
## 6 0 1 4 200.5 -4 98.25
## teamb_vegas_fscore teamlonga scorea teamlongb scoreb timeleft teama
## 1 102.25 Charlotte 42 Toronto 52 25.20000 cha
## 2 102.25 Charlotte 44 Toronto 58 20.93333 cha
## 3 102.25 Charlotte 19 Toronto 28 35.13333 cha
## 4 102.25 Charlotte 35 Toronto 41 28.18333 cha
## 5 102.25 Charlotte 40 Toronto 52 25.50000 cha
## 6 102.25 Charlotte 9 Toronto 12 43.30000 cha
## teamb scorea_scoreb date scoreb_scorea pct_complete pct_left
## 1 tor -10 2016-04-05 10 47.500000 52.50000
## 2 tor -14 2016-04-05 14 56.388889 43.61111
## 3 tor -9 2016-04-05 9 26.805556 73.19444
## 4 tor -6 2016-04-05 6 41.284722 58.71528
## 5 tor -12 2016-04-05 12 46.875000 53.12500
## 6 tor -3 2016-04-05 3 9.791667 90.20833
## cf1 cf2 teama_adj_fscore teamb_adj_fscore pfscoreb_pfscorea
## 1 6.884284 0.14286204 93.58125 105.6813 12.100000
## 2 10.569592 0.25452584 86.84792 102.5924 15.744444
## 3 5.250891 0.08348447 90.91354 102.8413 11.927778
## 4 3.906817 0.07411763 92.68776 101.0364 8.348611
## 5 8.212631 0.16881814 92.19531 106.3203 14.125000
## 6 1.577128 0.02120872 97.62969 104.2380 6.608333
write.csv(lrDF_final, file = "nba-datawrangle-lrDF.csv")